* Session setup: close any log left open, reset Stata, turn off -more- paging,
* then move to the project root and open this program's log.
capture log close
clear all
set more off
cd "${master_dir}"
log using "${log_dir}/1b-DefineRadius.log", replace
***************************************************************************************************
* 
* Program: 1b-DefineRadius.do
* Purpose: Define the appropriate radius and append/merge raw data
* Sections:
*     1. Load, append, and merge data
*     2. Clean and label variables and save dataset
*     3. Define distance radii groupings (10 to 100) and save daily counts
* Files Used:
*     1. Pings_100m_PollingPlace_11_1_16.dta - Pings_100m_PollingPlace_11_16_16.dta
*     2. PollingPlaces2016_w_TimeZones_and_Buildings.dta
* Files Created:
*     1. 1b-DefineRadius.log
*     2. Pings_all_days.dta
*     3. radiusdata.dta
*
***************************************************************************************************

***************************************************************************************************
*  1. Load, append, and merge data
***************************************************************************************************

* Load the (unadjusted) pings for November 1 and stack the files for November 2-16
* beneath them (each file also carries the distance to the nearest polling site)
use "${raw_dir}/Pings_100m_PollingPlace_11_1_16.dta", clear
forvalues nov_day = 2/16 {
    append using "${raw_dir}/Pings_100m_PollingPlace_11_`nov_day'_16.dta"
}

* Attach local time zone offsets, block-group identifier and building attributes of
* the nearest polling place; only pings that match a polling place are retained
rename NearestPollingPlace PollingPlace_ID
local pp_vars UTC_Offset_nDST UTC_Offset_DST gisjoin state county category subcategory ///
  building_area_m2
merge m:1 PollingPlace_ID using "${raw_dir}/PollingPlaces2016_w_TimeZones_and_Buildings", ///
  keepusing(`pp_vars') keep(match) nogenerate
sort gisjoin

* State & County FIPS from gisjoin (characters 2-3 = state, characters 5-7 = county)
gen statefips = substr(gisjoin,2,2)
gen statecountyfips = statefips + substr(gisjoin,5,3)

* Use County and State names of the associated GPS coordinate for the polling place;
* the names shipped with the polling-place file are kept with an _original suffix
rename (state county) (state_original county_original)
destring statecountyfips, generate(statecountyfips_numeric)
* NOTE(review): -countyfips- and -statastates- are user-written commands; they are
* presumed to add the county_name, state_name and code variables used below - confirm
countyfips, fips(statecountyfips_numeric) nogen
statastates, fips(state_fips)
* Discard rows that exist only in the lookup table (_merge == 2)
keep if inlist(_merge, 1, 3)
foreach place_var of varlist county_name state_name {
    replace `place_var' = proper(`place_var')
}
* proper() capitalizes "Of"; restore the conventional spelling
replace state_name = "District of Columbia" if state_name == "District Of Columbia"
drop _merge statecountyfips_numeric state_fips county_fips state_code county_code state_abbrev

***************************************************************************************************
*  2. Clean and label variables and save dataset
***************************************************************************************************

* Generate a Stata datetime variable from the UTC timestamp of each ping.
* The timestamp is treated as seconds since 01jan1970, so it is scaled to milliseconds
* and shifted to Stata's 01jan1960 origin. Unix-style timestamps do not count leap
* seconds, so the result is a %tc value (NOT %tC): %tC, tC() and dofC() all assume
* leap-second-adjusted values and would be off by the accumulated leap seconds
* (26 seconds in November 2016). %tc / tc() / dofc() are therefore used throughout.
gen double greenwich_date_sec = utc_timestamp*1000 + mdyhms(1,1,1970,0,0,0)
format greenwich_date_sec %tc

* Define non-DST and DST local datetimes before creating the harmonized one
* NOTE(review): offsets are subtracted, so they are presumably stored as hours
* behind UTC - confirm against the polling-place file
gen double local_date_sec_nDST = greenwich_date_sec - UTC_Offset_nDST*3600000
format local_date_sec_nDST %tc

gen double local_date_sec_DST = greenwich_date_sec - UTC_Offset_DST*3600000
format local_date_sec_DST %tc

* Create an HourOffset (time zone) variable that accounts for the DST split.
* The DST offset applies to any ping whose standard-time clock reads earlier than
* 01:00 on 06nov2016. The same tc() cutoff is used for HourOffset and
* local_date_sec so the two can never disagree.
gen HourOffset = UTC_Offset_nDST
replace HourOffset = UTC_Offset_DST if local_date_sec_nDST < tc(06nov2016 01:00:00)

* Create a new time variable with the DST adjustment to make sure it gets things right
* at the changeover
gen double local_date_sec = local_date_sec_nDST
replace local_date_sec = local_date_sec_DST if local_date_sec_nDST < tc(06nov2016 01:00:00)
format local_date_sec %tc
drop local_date_sec_nDST local_date_sec_DST UTC_Offset_nDST UTC_Offset_DST

* Drop first and last dates (since only partial coverage once restricted to local time)
gen day = dofc(local_date_sec)
drop if day == td(31oct2016) | day == td(16nov2016)
format day %td

* Order and label variables
order PollingPlace_ID ID_11_16 local_date_sec day Dist_to_PollingPlace_M Ping_in_ConvexHull ///
  Apple_0_Google_1 Sec_Since_Last_Ping Sec_Till_Next_Ping horizontal_accuracy latitude longitude ///
  geo_hash utc_timestamp greenwich_date_sec HourOffset gisjoin statefips statecountyfips ///
  state_name county_name state_original county_original
  
label var PollingPlace_ID "Polling place ID"
label var ID_11_16 "Phone ID"
label var local_date_sec "Date-time (milliseconds) time-zone adjusted"
label var day "Day of the year (2016)"
label var Dist_to_PollingPlace_M "Distance (meters) to nearest polling place"
label var Ping_in_ConvexHull "Ping located in convex hull of polling site building"
label var Apple_0_Google_1 "Android (0 = iPhone)"
label var Sec_Since_Last_Ping "Seconds since last ping outside of the 100m radius around building"
label var Sec_Till_Next_Ping "Seconds until next ping outside of the 100m radius around building"
label var horizontal_accuracy "Horizontal accuracy of ping location (0 = distance with 0 meters)"
label var latitude "Latitude of ping"
label var longitude "Longitude of ping"
label var geo_hash "Geohash of ping"
label var utc_timestamp "Original date-time variable"
label var greenwich_date_sec "Greenwich mean time of original date-time variable"
label var HourOffset "Hour offset from Greenwich Mean Time"
label var gisjoin "Block-group FIPS"
label var statefips "State FIPS of Polling Place"
label var statecountyfips "County FIPS of Polling Place"
label var state_name "State of Polling Place (GIS)"
label var county_name "County of Polling Place (GIS)"
label var state_original "State of Polling Place (Original)"
label var county_original "County of Polling Place (Original)"
label var category "Polling Place Building Category"
label var subcategory "Polling Place Building Sub-Category"
label var building_area_m2 "Polling Place Building Area (squared meters)"

save "${data_dir}/Pings_all_days.dta", replace

***************************************************************************************************
*  3. Define distance radii groupings (10 to 100) and plot ping counts
***************************************************************************************************

* Reload the cleaned ping-level data and keep only the variables the radius counts
* need. Full variable names are spelled out (rather than the abbreviations
* Dist_to_Polling / ID) so the section does not depend on -set varabbrev on-.
use "${data_dir}/Pings_all_days.dta", clear
keep Dist_to_PollingPlace_M day ID_11_16

* Indicator: ping lies within `r' meters of the polling place.
* A missing distance compares as larger than any number, so it yields 0.
forvalues r = 10(10)100 {
    gen DistLess`r' = (Dist_to_PollingPlace_M <= `r')
}

* Flag exactly one ping per phone-day-indicator group, then clear the flag for the
* "outside the radius" groups so that only phone-days within `r' meters stay flagged
forvalues r = 10(10)100 {
    gegen tag_IDdayDistLess`r' = tag(ID_11_16 day DistLess`r')
    replace tag_IDdayDistLess`r' = 0 if DistLess`r' == 0
}

* Daily number of unique phones with at least one ping within `r' meters
forvalues r = 10(10)100 {
    gegen dailycount_uniqueID_DistLess`r' = sum(tag_IDdayDistLess`r'), by(day)
}

* Collapse to one row per day and save
gegen tag_day = tag(day)

keep if tag_day == 1
save "${data_dir}/radiusdata.dta", replace

log close



